# Source code for nlp_architect.solutions.absa_solution.sentiment_solution

# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import argparse
import json
from os import PathLike
from os.path import isdir
from typing import Iterator, Optional, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm

from nlp_architect.common.core_nlp_doc import CoreNLPDoc
from nlp_architect.models.absa.inference.data_types import TermType, \
    SentimentDocEncoder, SentimentDoc
from nlp_architect.models.absa.inference.inference \
    import SentimentInference
from nlp_architect.models.absa.utils import load_opinion_lex
from nlp_architect.solutions.absa_solution import SENTIMENT_OUT
from nlp_architect.solutions.absa_solution.utils import Anonymiser, _ui_format
from nlp_architect.utils.io import walk_directory, validate_existing_filepath, \
    validate_existing_directory, validate_existing_path, line_count


class SentimentSolution(object):
    """Main class for executing Sentiment Solution pipeline.

    Args:
        anonymiser (Anonymiser, optional): Method to anonymise events' text.
        max_events (int, optional): Maximum number of events to show for each
            aspect-polarity pair.
    """

    def __init__(self, anonymiser: Optional["Anonymiser"] = None,
                 max_events: int = 400):
        self.anonymiser = anonymiser
        self.max_events = max_events
        # Make sure the output directory exists before any results are written.
        SENTIMENT_OUT.mkdir(parents=True, exist_ok=True)

    def run(self, aspect_lex: PathLike = None, opinion_lex: PathLike = None,
            data: PathLike = None, parsed_data: PathLike = None,
            inference_results: PathLike = None) -> Optional[pd.DataFrame]:
        """Run the sentiment pipeline and return aggregated statistics.

        Exactly one of ``data``, ``parsed_data`` or ``inference_results``
        is expected as the event source; the lexicons are always required.

        Args:
            aspect_lex: Path to aspect lexicon CSV (single column, no header).
            opinion_lex: Path to opinion lexicon.
            data: Directory of raw-text documents, or a file with one document
                per line.
            parsed_data: Directory/file of pre-parsed documents
                (CoreNLPDoc JSON).
            inference_results: Path to previously saved inference results JSON.

        Returns:
            DataFrame of per-aspect/polarity statistics, or None when no
            event source was supplied.

        Raises:
            ValueError: If the opinion or aspect lexicon is empty.
        """
        opinions = load_opinion_lex(opinion_lex)
        if not opinions:
            raise ValueError('Empty opinion lexicon!')
        aspects = pd.read_csv(aspect_lex, header=None, encoding='utf-8')[0]
        if aspects.empty:
            raise ValueError('Empty aspect lexicon!')

        if inference_results:
            # Reuse cached inference output instead of re-running the model.
            with open(inference_results, encoding='utf-8') as f:
                results = json.loads(f.read(), object_hook=SentimentDoc.decoder)
        elif data or parsed_data:
            inference = SentimentInference(aspect_lex, opinions, parse=False)
            parse = None
            if not parsed_data:
                # Source data is raw text, need to parse it first.
                # Imported lazily: the parser is heavy and only needed here.
                from nlp_architect.pipelines.spacy_bist import SpacyBISTParser
                parse = SpacyBISTParser().parse

            results = {}
            print('Running inference on data files... (Iterating data files)')
            data_source = parsed_data if parsed_data else data
            for file, doc in self._iterate_docs(data_source):
                parsed_doc = parse(doc) if parse \
                    else json.loads(doc, object_hook=CoreNLPDoc.decoder)
                sentiment_doc = inference.run(parsed_doc=parsed_doc)
                if sentiment_doc:
                    results[file] = sentiment_doc
            # Persist results so later runs can pass inference_results instead.
            with open(SENTIMENT_OUT / 'inference_results.json', 'w',
                      encoding='utf-8') as f:
                json.dump(results, f, cls=SentimentDocEncoder, indent=4,
                          sort_keys=True)
        else:
            print('No input given. Please supply one of: '
                  'data directory, parsed data directory, or inference results.')
            return None

        print("\nComputing statistics...")
        stats = self._compute_stats(results, aspects, opinions)
        print("Done.")
        return stats

    @staticmethod
    def _iterate_docs(data: PathLike) -> Iterator[Tuple[str, str]]:
        """Yield (doc_id, doc_text) pairs from a directory of documents,
        or from a file holding one document per line."""
        if isdir(data):
            for file, doc_text in tqdm(list(walk_directory(data))):
                yield file, doc_text
        else:
            with open(data, encoding='utf-8') as f:
                for i, doc_text in tqdm(enumerate(f), total=line_count(data)):
                    # 1-based line number serves as the document id.
                    yield str(i + 1), doc_text

    def _compute_stats(self, results: dict, aspects: list,
                       opinion_lex: dict) -> pd.DataFrame:
        """Aggregates counts for each aspect-polarity pair, with separate
        counts for in-domain only events."""
        index = pd.MultiIndex.from_product(
            [aspects, ['POS', 'NEG'], [False, True]],
            names=['Aspect', 'Polarity', 'inDomain'])
        stats = pd.DataFrame(columns=['Quantity', 'Score'], index=index)
        stats[['Quantity', 'Score']] = stats[['Quantity', 'Score']].fillna(0)
        stats = stats.sort_index()
        # Parallel frame holding sort keys for the event sentences.
        scores = stats.copy()
        for doc in tqdm(results.values()):
            for sent in doc.sentences:
                for event in sent.events:
                    aspect = [t for t in event if t.type == TermType.ASPECT][0]
                    opinion = [t for t in event if t.type == TermType.OPINION][0]
                    score = aspect.score
                    key = aspect.text, aspect.polarity.name
                    # Always counted under inDomain=False; counted again under
                    # inDomain=True only when the opinion term was acquired.
                    count = self._add_event(stats, key, False, score)
                    in_domain = opinion_lex[opinion.text.lower()].is_acquired
                    count_dom = self._add_event(stats, key, True, score) \
                        if in_domain else -1
                    if count <= self.max_events:
                        sent_ui = _ui_format(sent, doc)
                        self._add_sentence(sent_ui, stats, scores, key, False,
                                           count, score)
                        if in_domain:
                            self._add_sentence(sent_ui, stats, scores, key,
                                               True, count_dom, score)
        for key in index:  # sort sentences according to their scores
            stats.loc[key, 2:] = \
                stats.loc[key][2:][np.argsort(scores.loc[key][2:])].tolist()
        return stats

    def _add_sentence(self, sent_ui: str, stats: pd.DataFrame,
                      scores: pd.DataFrame, key: tuple, in_domain: bool,
                      count: int, score: int) -> int:
        """Utility function for adding event sentence to output."""
        sent_ui = self.anonymiser.run(sent_ui) if self.anonymiser else sent_ui
        sent_key = key + (in_domain,), 'Sent_' + str(count)
        stats.at[sent_key] = sent_ui
        # Negated absolute score so np.argsort puts strongest events first.
        scores.at[sent_key] = -abs(score)
        return count

    @staticmethod
    def _add_event(df: pd.DataFrame, key: tuple, in_domain: bool,
                   score: int) -> int:
        """Utility function for incrementing event counts.

        Returns:
            Updated event count for the (aspect, polarity, in_domain) row.
        """
        key = key + (in_domain,)
        count = int(df.loc[key, 'Quantity']) + 1
        df.loc[key, 'Quantity'] = count
        df.loc[key, 'Score'] += score
        return count
def main() -> None:
    """Command-line entry point for the ABSA sentiment solution.

    Requires --aspects and --opinions lexicon paths; exactly one of
    --data, --parsed or --res supplies the events to analyze.
    """
    parser = argparse.ArgumentParser(description='Aspect-Based Sentiment Analysis')
    parser.add_argument('--data', type=validate_existing_path,
                        help='Path to data')
    parser.add_argument('--aspects', type=validate_existing_filepath,
                        help='Path to aspect lexicon', required=True)
    parser.add_argument('--opinions', type=validate_existing_filepath,
                        help='Path to opinion lexicon', required=True)
    parser.add_argument('--parsed', type=validate_existing_directory,
                        help='Path to parsed data')
    parser.add_argument('--res', type=validate_existing_filepath,
                        help='Path to inference results')
    args = parser.parse_args()
    solution = SentimentSolution()
    solution.run(data=args.data, parsed_data=args.parsed,
                 inference_results=args.res, aspect_lex=args.aspects,
                 opinion_lex=args.opinions)


if __name__ == '__main__':
    main()